# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zhizhu.items import ZhizhuItem


class SchoolspiderSpider(CrawlSpider):
    """Crawl the hbei.com.cn news section and emit one item per article page.

    The crawl starts at the first news listing page. Links to other listing
    pages are followed (without a callback) so that pagination is walked,
    and links to individual article pages are handed to ``parse_item``.
    """

    name = 'schoolspider'
    allowed_domains = ['hbei.com.cn']
    start_urls = ['http://www.hbei.com.cn/news/news/newslist_1.html']

    # The dots in the host name and in ".html" are escaped so they match a
    # literal "." only; unescaped, "." is a regex wildcard and would also
    # accept URLs such as ".../newslist_1xhtml".
    rules = (
        # Listing (pagination) pages: follow their links, extract nothing.
        Rule(
            LinkExtractor(
                allow=r'http://www\.hbei\.com\.cn/news/news/newslist_\d+\.html',
            ),
            follow=True,
        ),
        # Article pages: parse them, but do not follow links found on them.
        Rule(
            LinkExtractor(
                allow=r'http://www\.hbei\.com\.cn/news/news/news_\d+\.html',
            ),
            callback='parse_item',
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Extract the title, info line and article body from an article page.

        Args:
            response: The response for a URL matched by the article rule.

        Yields:
            A ``ZhizhuItem`` whose ``title``, ``info`` and ``article`` fields
            each hold the list returned by ``extract()`` (one string per
            matched node; an empty list when nothing matches).
        """
        # Text nodes that are direct children of the <h1> inside the content div.
        title = response.xpath('.//div[@class="content"]/h1/text()').extract()

        # string(.) collapses each matched <div> and all of its descendants
        # into a single string. The alternative
        #   response.xpath('//div[@class="content"]/div[@class="info"]//text()')
        # selects almost the same text, but returns every text node as a
        # separate list element instead of one joined string.
        info = (
            response.xpath('.//div[@class="content"]/div[@class="info"]')
            .xpath('string(.)')
            .extract()
        )
        article = (
            response.xpath('.//div[@class="content"]/div[@class="article"]')
            .xpath('string(.)')
            .extract()
        )

        item = ZhizhuItem()
        item['title'] = title
        item['info'] = info
        item['article'] = article
        yield item
